In [7]:
import pandas as pd
import matplotlib.pyplot as plt

# Importing libraries for data preprocessing and clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

Reading the dataset¶

In [9]:
# Read the dataset from a CSV file into a pandas DataFrame
df_pre = pd.read_csv("dataset.csv")
In [10]:
df_pre
Out[10]:
Unnamed: 0 track_id artists album_name track_name popularity duration_ms explicit danceability energy ... loudness mode speechiness acousticness instrumentalness liveness valence tempo time_signature track_genre
0 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino Comedy Comedy 73 230666 False 0.676 0.4610 ... -6.746 0 0.1430 0.0322 0.000001 0.3580 0.7150 87.917 4 acoustic
1 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward Ghost (Acoustic) Ghost - Acoustic 55 149610 False 0.420 0.1660 ... -17.235 1 0.0763 0.9240 0.000006 0.1010 0.2670 77.489 4 acoustic
2 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN To Begin Again To Begin Again 57 210826 False 0.438 0.3590 ... -9.734 1 0.0557 0.2100 0.000000 0.1170 0.1200 76.332 4 acoustic
3 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis Crazy Rich Asians (Original Motion Picture Sou... Can't Help Falling In Love 71 201933 False 0.266 0.0596 ... -18.515 1 0.0363 0.9050 0.000071 0.1320 0.1430 181.740 3 acoustic
4 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet Hold On Hold On 82 198853 False 0.618 0.4430 ... -9.681 1 0.0526 0.4690 0.000000 0.0829 0.1670 119.949 4 acoustic
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
113995 113995 2C3TZjDRiAzdyViavDJ217 Rainy Lullaby #mindfulness - Soft Rain for Mindful Meditatio... Sleep My Little Boy 21 384999 False 0.172 0.2350 ... -16.393 1 0.0422 0.6400 0.928000 0.0863 0.0339 125.995 5 world-music
113996 113996 1hIz5L4IB9hN3WRYPOCGPw Rainy Lullaby #mindfulness - Soft Rain for Mindful Meditatio... Water Into Light 22 385000 False 0.174 0.1170 ... -18.318 0 0.0401 0.9940 0.976000 0.1050 0.0350 85.239 4 world-music
113997 113997 6x8ZfSoqDjuNa5SVP5QjvX Cesária Evora Best Of Miss Perfumado 22 271466 False 0.629 0.3290 ... -10.895 0 0.0420 0.8670 0.000000 0.0839 0.7430 132.378 4 world-music
113998 113998 2e6sXL2bYv4bSz6VTdnfLs Michael W. Smith Change Your World Friends 41 283893 False 0.587 0.5060 ... -10.889 1 0.0297 0.3810 0.000000 0.2700 0.4130 135.960 4 world-music
113999 113999 2hETkH7cOfqmz3LqZDHZf5 Cesária Evora Miss Perfumado Barbincor 22 241826 False 0.526 0.4870 ... -10.204 0 0.0725 0.6810 0.000000 0.0893 0.7080 79.198 4 world-music

114000 rows × 21 columns

Exploratory Data Analysis¶

In [12]:
# Checking the shape of the DataFrame 
df_pre.shape
Out[12]:
(114000, 21)

Dropping genres¶

Several genres have been identified for exclusion from the dataset. These genres are selected on random and mostly slightly different genre of similar sound songs were dropped.

In [14]:
# List of genres to be dropped from the DataFrame
genres_drop = ['ska', 'trip-hop', 'grindcore', 'death-metal', 'metalcore', 'honky-tonk','detroit-techno', 'black-metal', 'new-age', 'sertanejo', 
               'world-music', 'singer-songwriter', 'j-idol', 'j-dance', 'j-rock', 'j-pop','indian', 'bluegrass', 'breakbeat', 'chicago-house', 
               'malay', 'cantopop', 'mandopop', 'rockabilly',  'kids',  'children', 'german', 'progressive-house', 'hardstyle', 'minimal-techno', 
               'mpb', 'study', 'pop-film', 'pagode', 'turkish', 'tango', 'swedish', 'show-tunes', 'anime', 'power-pop', 'dub', 'idm', 'rock-n-roll', 
               'samba', 'sleep']
In [15]:
df = df_pre[~df_pre['track_genre'].isin(genres_drop)]
In [16]:
df
Out[16]:
Unnamed: 0 track_id artists album_name track_name popularity duration_ms explicit danceability energy ... loudness mode speechiness acousticness instrumentalness liveness valence tempo time_signature track_genre
0 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino Comedy Comedy 73 230666 False 0.676 0.4610 ... -6.746 0 0.1430 0.032200 0.000001 0.3580 0.7150 87.917 4 acoustic
1 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward Ghost (Acoustic) Ghost - Acoustic 55 149610 False 0.420 0.1660 ... -17.235 1 0.0763 0.924000 0.000006 0.1010 0.2670 77.489 4 acoustic
2 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN To Begin Again To Begin Again 57 210826 False 0.438 0.3590 ... -9.734 1 0.0557 0.210000 0.000000 0.1170 0.1200 76.332 4 acoustic
3 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis Crazy Rich Asians (Original Motion Picture Sou... Can't Help Falling In Love 71 201933 False 0.266 0.0596 ... -18.515 1 0.0363 0.905000 0.000071 0.1320 0.1430 181.740 3 acoustic
4 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet Hold On Hold On 82 198853 False 0.618 0.4430 ... -9.681 1 0.0526 0.469000 0.000000 0.0829 0.1670 119.949 4 acoustic
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
110995 110995 7sLknEg8aVr0m5ZuCja7b3 NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Divergence - Radio Edit 28 185142 False 0.148 0.9930 ... -7.696 0 0.0922 0.009700 0.937000 0.0376 0.0928 140.001 4 trance
110996 110996 6veycwSGozeHSFQ6fbr5dC NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Forgotten - Radio Edit 28 193714 False 0.504 0.9850 ... -7.305 1 0.0504 0.000810 0.922000 0.1250 0.3830 139.978 4 trance
110997 110997 0MLEzWJQcRkc5IMAqucPbV NG Rezonance;Begbie Syncopy Radio Edits, Vol. 1 Feel The Panic - Radio Edit 28 209600 False 0.474 0.9950 ... -4.265 1 0.0979 0.000166 0.369000 0.1500 0.0634 150.002 4 trance
110998 110998 0cRNPYxzXLNLQd1g4kKYS6 NG Rezonance Syncopy Radio Edits, Vol. 1 Fate - Instrumental Radio Edit 28 134800 False 0.416 0.9810 ... -3.653 0 0.0943 0.000079 0.928000 0.1870 0.0662 150.054 3 trance
110999 110999 2dDE3WCSj2cELFYO1IfECD NG Rezonance Syncopy Radio Edits, Vol. 1 Deception - NG Rezonance 2013 Radio Edit 28 162206 False 0.469 0.9870 ... -5.525 0 0.0796 0.000055 0.932000 0.3080 0.2430 145.000 4 trance

69000 rows × 21 columns

In [17]:
df.shape
Out[17]:
(69000, 21)

Dropping NaN values¶

In [19]:
df.dropna(inplace=True)
In [20]:
df
Out[20]:
Unnamed: 0 track_id artists album_name track_name popularity duration_ms explicit danceability energy ... loudness mode speechiness acousticness instrumentalness liveness valence tempo time_signature track_genre
0 0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino Comedy Comedy 73 230666 False 0.676 0.4610 ... -6.746 0 0.1430 0.032200 0.000001 0.3580 0.7150 87.917 4 acoustic
1 1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward Ghost (Acoustic) Ghost - Acoustic 55 149610 False 0.420 0.1660 ... -17.235 1 0.0763 0.924000 0.000006 0.1010 0.2670 77.489 4 acoustic
2 2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN To Begin Again To Begin Again 57 210826 False 0.438 0.3590 ... -9.734 1 0.0557 0.210000 0.000000 0.1170 0.1200 76.332 4 acoustic
3 3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis Crazy Rich Asians (Original Motion Picture Sou... Can't Help Falling In Love 71 201933 False 0.266 0.0596 ... -18.515 1 0.0363 0.905000 0.000071 0.1320 0.1430 181.740 3 acoustic
4 4 5vjLSffimiIP26QG5WcN2K Chord Overstreet Hold On Hold On 82 198853 False 0.618 0.4430 ... -9.681 1 0.0526 0.469000 0.000000 0.0829 0.1670 119.949 4 acoustic
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
110995 110995 7sLknEg8aVr0m5ZuCja7b3 NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Divergence - Radio Edit 28 185142 False 0.148 0.9930 ... -7.696 0 0.0922 0.009700 0.937000 0.0376 0.0928 140.001 4 trance
110996 110996 6veycwSGozeHSFQ6fbr5dC NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Forgotten - Radio Edit 28 193714 False 0.504 0.9850 ... -7.305 1 0.0504 0.000810 0.922000 0.1250 0.3830 139.978 4 trance
110997 110997 0MLEzWJQcRkc5IMAqucPbV NG Rezonance;Begbie Syncopy Radio Edits, Vol. 1 Feel The Panic - Radio Edit 28 209600 False 0.474 0.9950 ... -4.265 1 0.0979 0.000166 0.369000 0.1500 0.0634 150.002 4 trance
110998 110998 0cRNPYxzXLNLQd1g4kKYS6 NG Rezonance Syncopy Radio Edits, Vol. 1 Fate - Instrumental Radio Edit 28 134800 False 0.416 0.9810 ... -3.653 0 0.0943 0.000079 0.928000 0.1870 0.0662 150.054 3 trance
110999 110999 2dDE3WCSj2cELFYO1IfECD NG Rezonance Syncopy Radio Edits, Vol. 1 Deception - NG Rezonance 2013 Radio Edit 28 162206 False 0.469 0.9870 ... -5.525 0 0.0796 0.000055 0.932000 0.3080 0.2430 145.000 4 trance

68999 rows × 21 columns

In [21]:
df.shape
Out[21]:
(68999, 21)
In [22]:
df.describe()
Out[22]:
Unnamed: 0 popularity duration_ms danceability energy key loudness mode speechiness acousticness instrumentalness liveness valence tempo time_signature
count 68999.000000 68999.000000 6.899900e+04 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000 68999.000000
mean 51426.826476 33.580907 2.252585e+05 0.576138 0.645810 5.325584 -7.996436 0.627009 0.088281 0.302674 0.129505 0.204778 0.477796 122.057113 3.911129
std 31499.289759 24.772096 9.418724e+04 0.168848 0.247882 3.556244 4.958435 0.483603 0.119065 0.329293 0.283917 0.180854 0.254506 29.876655 0.405602
min 0.000000 0.000000 8.586000e+03 0.000000 0.000020 0.000000 -43.957000 0.000000 0.000000 0.000001 0.000000 0.009250 0.000000 0.000000 0.000000
25% 26249.500000 4.000000 1.755505e+05 0.465000 0.488000 2.000000 -9.517500 0.000000 0.036200 0.018500 0.000000 0.096900 0.268000 98.299500 4.000000
50% 46499.000000 35.000000 2.113410e+05 0.588000 0.694000 5.000000 -6.669000 1.000000 0.049100 0.152000 0.000029 0.129000 0.468000 121.820000 4.000000
75% 79749.500000 54.000000 2.560000e+05 0.703000 0.849000 8.000000 -4.839000 1.000000 0.084500 0.558000 0.018200 0.258000 0.682500 140.044000 4.000000
max 110999.000000 100.000000 4.246206e+06 0.980000 1.000000 11.000000 4.532000 1.000000 0.965000 0.996000 0.999000 0.997000 0.995000 243.372000 5.000000

Overall, from this dataset, we can expect to see clusters of tracks with similar features, suggesting common musical genres or styles. For example, clustering analysis could be performed to identify groups of tracks with similar acoustic, energetic, or emotional characteristics, helping music streaming platforms recommend tracks to users based on their preferences.

genre = df['track_genre'].unique()

print(genre)

left with this¶

print(len(genre))

KDE Plot¶

Using seaborn to visualize the distribution of numerical features in the dataset. Each subplot represents the distribution of a specific numerical feature, with a histogram and kernel density estimation (kde).

In [27]:
import seaborn as sns
In [28]:
sns.set_style("darkgrid") 
  
# Identifying numerical columns 
numerical_columns = df.select_dtypes(include=["int64", "float64"]).columns 
  
# Plotting distribution of each numerical feature 
plt.figure(figsize=(14, len(numerical_columns) * 3)) 
for idx, feature in enumerate(numerical_columns, 1): 
    plt.subplot(len(numerical_columns), 2, idx) 
    sns.histplot(df[feature], kde=True) 
    plt.title(f"{feature} | Skewness: {round(df[feature].skew(), 2)}") 
  
# To adjust layout and show plots 
plt.tight_layout() 
plt.show()

The plots provide an overview of the distribution of each numerical feature, facilitating the identification of potential skewness or anomalies. Skewness measures the asymmetry of the data distribution, with values close to zero indicating symmetry, and positive or negative values indicating right or left skew, respectively.

Features with significant skewness may require further preprocessing or transformation to enhance model performance in predictive tasks. In our case, we didn't use time_signature and mode for out model as evident from visualisation.

PAIR PLOT¶

Aagain utilizes seaborn to create a pair plot for the DataFrame. Each scatterplot represents the relationship between two features, with one feature plotted against the other.

In [31]:
# Color palette for seaborn plots
sns.set_palette("Pastel1")

# Figure for the pair plot
plt.figure(figsize=(10, 6))

# Pair plot 
sns.pairplot(df)

#TITLE
plt.suptitle('Pair Plot for DataFrame')

# Save as an image
plt.savefig('pair_plot.png')

# show
plt.show()
<Figure size 1000x600 with 0 Axes>

Scatterplots reveal the patterns, trends, and potential correlations between pairs of features. Diagonal plots show the distribution of individual features, while off-diagonal plots display the relationships between pairs of features.

Correlation Matrix Analysis¶

The code generates a heatmap visualization of the correlation matrix for the numeric columns in the DataFrame. Each cell in the heatmap represents the correlation coefficient between two numeric features.

In [34]:
sns.set(style="whitegrid")

# Only numeric columns from the DataFrame
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Correlation matrix
correlation_matrix = numeric_df.corr()

colors = sns.color_palette('viridis')

# Plotting the Correlation Matrix
plt.figure(figsize=(12, 10))

sns.heatmap(correlation_matrix, annot=True, cmap='viridis', fmt=".2f", linewidths=0.5, vmin=-1, vmax=1, cbar_kws={'label': 'Correlation'}, annot_kws={"size": 10}, square=True)

plt.title('Correlation Matrix', fontsize=16, fontweight='bold')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability if needed
plt.yticks(rotation=0)
plt.show()

The heatmap provides a visual representation of the strength and direction of the linear relationship between pairs of features. Correlation coefficients close to 1 or -1 indicate strong positive or negative correlations, respectively. Values close to 0 indicate weak or no correlation. We decided to: 'Unnamed: 0','popularity', 'instrumentalness', 'explicit', 'duration_ms', drop these columns based on the matrix.

In [36]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 68999 entries, 0 to 110999
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        68999 non-null  int64  
 1   track_id          68999 non-null  object 
 2   artists           68999 non-null  object 
 3   album_name        68999 non-null  object 
 4   track_name        68999 non-null  object 
 5   popularity        68999 non-null  int64  
 6   duration_ms       68999 non-null  int64  
 7   explicit          68999 non-null  bool   
 8   danceability      68999 non-null  float64
 9   energy            68999 non-null  float64
 10  key               68999 non-null  int64  
 11  loudness          68999 non-null  float64
 12  mode              68999 non-null  int64  
 13  speechiness       68999 non-null  float64
 14  acousticness      68999 non-null  float64
 15  instrumentalness  68999 non-null  float64
 16  liveness          68999 non-null  float64
 17  valence           68999 non-null  float64
 18  tempo             68999 non-null  float64
 19  time_signature    68999 non-null  int64  
 20  track_genre       68999 non-null  object 
dtypes: bool(1), float64(9), int64(6), object(5)
memory usage: 11.1+ MB

Now the dataset contains 68,999 entries and 21 columns.

Dropping irrelavant columns¶

In [39]:
df = df.drop(['Unnamed: 0','popularity', 'instrumentalness', 'explicit', 'duration_ms', 'key'], axis=1)

df

In [41]:
df.shape
Out[41]:
(68999, 15)

Transforming¶

First feature scaling is applied to the selected numerical columns in the DataFrame using the StandardScaler from scikit-learn. It first initializes the scaler object and specifies the columns to be scaled. Then, it transforms the selected columns using the fit_transform method of the scaler object.

In [43]:
scaler = StandardScaler()
In [44]:
number_cols = ['valence', 'acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo']
In [45]:
df[number_cols] = scaler.fit_transform(df[number_cols])
In [46]:
df
Out[46]:
track_id artists album_name track_name danceability energy loudness mode speechiness acousticness liveness valence tempo time_signature track_genre
0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino Comedy Comedy 0.591433 -0.745560 0.252185 -1.296546 0.459572 -0.821385 0.847224 0.932025 -1.142710 4 acoustic
1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward Ghost (Acoustic) Ghost - Acoustic -0.924730 -1.935649 -1.863215 0.771280 -0.100629 1.886861 -0.573825 -0.828263 -1.491748 4 acoustic
2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN To Begin Again To Begin Again -0.818125 -1.157048 -0.350428 0.771280 -0.273644 -0.281437 -0.485355 -1.405857 -1.530474 4 acoustic
3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis Crazy Rich Asians (Original Motion Picture Sou... Can't Help Falling In Love -1.836797 -2.364888 -2.121363 0.771280 -0.436581 1.829161 -0.402414 -1.315485 1.997657 3 acoustic
4 5vjLSffimiIP26QG5WcN2K Chord Overstreet Hold On Hold On 0.247927 -0.818176 -0.339740 0.771280 -0.299681 0.505103 -0.673906 -1.221184 -0.070561 4 acoustic
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
110995 7sLknEg8aVr0m5ZuCja7b3 NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Divergence - Radio Edit -2.535654 1.400634 0.060591 -1.296546 0.032912 -0.889714 -0.924387 -1.512732 0.600603 4 trance
110996 6veycwSGozeHSFQ6fbr5dC NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Forgotten - Radio Edit -0.427239 1.368360 0.139447 0.771280 -0.318158 -0.916711 -0.441120 -0.372474 0.599833 4 trance
110997 0MLEzWJQcRkc5IMAqucPbV NG Rezonance;Begbie Syncopy Radio Edits, Vol. 1 Feel The Panic - Radio Edit -0.604915 1.408702 0.752549 0.771280 0.080785 -0.918667 -0.302886 -1.628251 0.935349 4 trance
110998 0cRNPYxzXLNLQd1g4kKYS6 NG Rezonance Syncopy Radio Edits, Vol. 1 Fate - Instrumental Radio Edit -0.948420 1.352224 0.875975 -1.296546 0.050550 -0.918932 -0.098299 -1.617249 0.937089 3 trance
110999 2dDE3WCSj2cELFYO1IfECD NG Rezonance Syncopy Radio Edits, Vol. 1 Deception - NG Rezonance 2013 Radio Edit -0.634527 1.376429 0.498434 -1.296546 -0.072913 -0.919005 0.570755 -0.922564 0.767926 4 trance

68999 rows × 15 columns

Elbow Method¶

This function, optimise_k_means, aims to find the optimal number of clusters for K-means clustering by generating an elbow plot. It iterates over a range of cluster numbers from 1 to max_k, fits a K-means model for each cluster number, and calculates the inertia (within-cluster sum of squared distances to the nearest cluster center). The inertia is then plotted against the number of clusters to visualize the elbow point, which represents the optimal number of clusters where the inertia starts to decrease at a slower rate.

In [48]:
def optimise_k_means(data, max_k):
    means = []
    inertias = []
    
    for k in range(1, max_k):
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(data)
        
        means.append(k)
        inertias.append(kmeans.inertia_)
        
    # Generate the elbow plot
    fig = plt.subplots(figsize=(10,5))
    plt.plot(means, inertias, 'o-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.grid(True)
    plt.show()
In [49]:
optimise_k_means(df[number_cols], 10)

From the graph 7 seems perfect for us as then the line starts damping into an asymptote.

Clustering¶

Clustering with K-Means

Here, the simple K-means clustering algorithm was used to divide his dataset into seven clusters based on the numerical audio features of each genres.

In [52]:
kmeans = KMeans(n_clusters=7)
In [53]:
kmeans.fit(df[number_cols])
Out[53]:
KMeans(n_clusters=7)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=7)
In [54]:
df['kmeans_7'] = kmeans.labels_
In [55]:
df
Out[55]:
track_id artists album_name track_name danceability energy loudness mode speechiness acousticness liveness valence tempo time_signature track_genre kmeans_7
0 5SuOikwiRyPMVoIQDJUgSV Gen Hoshino Comedy Comedy 0.591433 -0.745560 0.252185 -1.296546 0.459572 -0.821385 0.847224 0.932025 -1.142710 4 acoustic 1
1 4qPNDBW1i3p13qLCt0Ki3A Ben Woodward Ghost (Acoustic) Ghost - Acoustic -0.924730 -1.935649 -1.863215 0.771280 -0.100629 1.886861 -0.573825 -0.828263 -1.491748 4 acoustic 3
2 1iJBSr7s7jYXzM8EGcbK5b Ingrid Michaelson;ZAYN To Begin Again To Begin Again -0.818125 -1.157048 -0.350428 0.771280 -0.273644 -0.281437 -0.485355 -1.405857 -1.530474 4 acoustic 6
3 6lfxq3CG4xtTiEg7opyCyx Kina Grannis Crazy Rich Asians (Original Motion Picture Sou... Can't Help Falling In Love -1.836797 -2.364888 -2.121363 0.771280 -0.436581 1.829161 -0.402414 -1.315485 1.997657 3 acoustic 3
4 5vjLSffimiIP26QG5WcN2K Chord Overstreet Hold On Hold On 0.247927 -0.818176 -0.339740 0.771280 -0.299681 0.505103 -0.673906 -1.221184 -0.070561 4 acoustic 6
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
110995 7sLknEg8aVr0m5ZuCja7b3 NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Divergence - Radio Edit -2.535654 1.400634 0.060591 -1.296546 0.032912 -0.889714 -0.924387 -1.512732 0.600603 4 trance 0
110996 6veycwSGozeHSFQ6fbr5dC NG Rezonance;PHD Syncopy Radio Edits, Vol. 1 Forgotten - Radio Edit -0.427239 1.368360 0.139447 0.771280 -0.318158 -0.916711 -0.441120 -0.372474 0.599833 4 trance 0
110997 0MLEzWJQcRkc5IMAqucPbV NG Rezonance;Begbie Syncopy Radio Edits, Vol. 1 Feel The Panic - Radio Edit -0.604915 1.408702 0.752549 0.771280 0.080785 -0.918667 -0.302886 -1.628251 0.935349 4 trance 0
110998 0cRNPYxzXLNLQd1g4kKYS6 NG Rezonance Syncopy Radio Edits, Vol. 1 Fate - Instrumental Radio Edit -0.948420 1.352224 0.875975 -1.296546 0.050550 -0.918932 -0.098299 -1.617249 0.937089 3 trance 0
110999 2dDE3WCSj2cELFYO1IfECD NG Rezonance Syncopy Radio Edits, Vol. 1 Deception - NG Rezonance 2013 Radio Edit -0.634527 1.376429 0.498434 -1.296546 -0.072913 -0.919005 0.570755 -0.922564 0.767926 4 trance 0

68999 rows × 16 columns

Visualisation¶

Visulaising the clusters identified by KMeans clustering using t-SNE (t-distributed Stochastic Neighbor Embedding). It is a dimensionality reduction technique that helps in visualizing high-dimensional datasets by mapping them to two or three dimensions while preserving their structural integrity.

  1. We begin by preparing the data, which includes numerical columns representing various track features such as tempo, loudness, and danceability, along with columns for track genres and cluster labels from KMeans clustering with seven clusters. To ensure each feature contributes equally to t-SNE's distance calculations, we standardize the data using StandardScaler from scikit-learn. We then apply t-SNE to the standardized data, reducing its dimensionality to two for visualization purposes. The resulting 2D t-SNE embedding is visualized using Plotly, with points colored based on their cluster labels.
In [58]:
# Visualizing the Clusters with t-SNE
import plotly.express as px

from sklearn.manifold import TSNE
from sklearn.pipeline import Pipeline

# Pipeline to standardize the data and then apply t-SNE
tsne_pipeline = Pipeline([('scaler', StandardScaler()), ('tsne', TSNE(n_components=2, verbose=1))])

# Fiting the pipeline to the data and transform it to get the t-SNE embedding
genre_embedding = tsne_pipeline.fit_transform(df[number_cols])

# DataFrame to hold the t-SNE projection
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)

# Adding the genre and cluster information
projection['genres'] = df['track_genre']
projection['cluster'] = df['kmeans_7']

# Scatter plot to visualize the t-SNE projection colored by clusters
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 68999 samples in 0.102s...
[t-SNE] Computed neighbors for 68999 samples in 22.347s...
[t-SNE] Computed conditional probabilities for sample 1000 / 68999
[t-SNE] Computed conditional probabilities for sample 2000 / 68999
[t-SNE] Computed conditional probabilities for sample 3000 / 68999
[t-SNE] Computed conditional probabilities for sample 4000 / 68999
[t-SNE] Computed conditional probabilities for sample 5000 / 68999
[t-SNE] Computed conditional probabilities for sample 6000 / 68999
[t-SNE] Computed conditional probabilities for sample 7000 / 68999
[t-SNE] Computed conditional probabilities for sample 8000 / 68999
[t-SNE] Computed conditional probabilities for sample 9000 / 68999
[t-SNE] Computed conditional probabilities for sample 10000 / 68999
[t-SNE] Computed conditional probabilities for sample 11000 / 68999
[t-SNE] Computed conditional probabilities for sample 12000 / 68999
[t-SNE] Computed conditional probabilities for sample 13000 / 68999
[t-SNE] Computed conditional probabilities for sample 14000 / 68999
[t-SNE] Computed conditional probabilities for sample 15000 / 68999
[t-SNE] Computed conditional probabilities for sample 16000 / 68999
[t-SNE] Computed conditional probabilities for sample 17000 / 68999
[t-SNE] Computed conditional probabilities for sample 18000 / 68999
[t-SNE] Computed conditional probabilities for sample 19000 / 68999
[t-SNE] Computed conditional probabilities for sample 20000 / 68999
[t-SNE] Computed conditional probabilities for sample 21000 / 68999
[t-SNE] Computed conditional probabilities for sample 22000 / 68999
[t-SNE] Computed conditional probabilities for sample 23000 / 68999
[t-SNE] Computed conditional probabilities for sample 24000 / 68999
[t-SNE] Computed conditional probabilities for sample 25000 / 68999
[t-SNE] Computed conditional probabilities for sample 26000 / 68999
[t-SNE] Computed conditional probabilities for sample 27000 / 68999
[t-SNE] Computed conditional probabilities for sample 28000 / 68999
[t-SNE] Computed conditional probabilities for sample 29000 / 68999
[t-SNE] Computed conditional probabilities for sample 30000 / 68999
[t-SNE] Computed conditional probabilities for sample 31000 / 68999
[t-SNE] Computed conditional probabilities for sample 32000 / 68999
[t-SNE] Computed conditional probabilities for sample 33000 / 68999
[t-SNE] Computed conditional probabilities for sample 34000 / 68999
[t-SNE] Computed conditional probabilities for sample 35000 / 68999
[t-SNE] Computed conditional probabilities for sample 36000 / 68999
[t-SNE] Computed conditional probabilities for sample 37000 / 68999
[t-SNE] Computed conditional probabilities for sample 38000 / 68999
[t-SNE] Computed conditional probabilities for sample 39000 / 68999
[t-SNE] Computed conditional probabilities for sample 40000 / 68999
[t-SNE] Computed conditional probabilities for sample 41000 / 68999
[t-SNE] Computed conditional probabilities for sample 42000 / 68999
[t-SNE] Computed conditional probabilities for sample 43000 / 68999
[t-SNE] Computed conditional probabilities for sample 44000 / 68999
[t-SNE] Computed conditional probabilities for sample 45000 / 68999
[t-SNE] Computed conditional probabilities for sample 46000 / 68999
[t-SNE] Computed conditional probabilities for sample 47000 / 68999
[t-SNE] Computed conditional probabilities for sample 48000 / 68999
[t-SNE] Computed conditional probabilities for sample 49000 / 68999
[t-SNE] Computed conditional probabilities for sample 50000 / 68999
[t-SNE] Computed conditional probabilities for sample 51000 / 68999
[t-SNE] Computed conditional probabilities for sample 52000 / 68999
[t-SNE] Computed conditional probabilities for sample 53000 / 68999
[t-SNE] Computed conditional probabilities for sample 54000 / 68999
[t-SNE] Computed conditional probabilities for sample 55000 / 68999
[t-SNE] Computed conditional probabilities for sample 56000 / 68999
[t-SNE] Computed conditional probabilities for sample 57000 / 68999
[t-SNE] Computed conditional probabilities for sample 58000 / 68999
[t-SNE] Computed conditional probabilities for sample 59000 / 68999
[t-SNE] Computed conditional probabilities for sample 60000 / 68999
[t-SNE] Computed conditional probabilities for sample 61000 / 68999
[t-SNE] Computed conditional probabilities for sample 62000 / 68999
[t-SNE] Computed conditional probabilities for sample 63000 / 68999
[t-SNE] Computed conditional probabilities for sample 64000 / 68999
[t-SNE] Computed conditional probabilities for sample 65000 / 68999
[t-SNE] Computed conditional probabilities for sample 66000 / 68999
[t-SNE] Computed conditional probabilities for sample 67000 / 68999
[t-SNE] Computed conditional probabilities for sample 68000 / 68999
[t-SNE] Computed conditional probabilities for sample 68999 / 68999
[t-SNE] Mean sigma: 0.000000
[t-SNE] KL divergence after 250 iterations with early exaggeration: 101.086151
[t-SNE] KL divergence after 1000 iterations: 2.287273
In [59]:
import numpy as np
# Song clustering Pipeline
song_cluster_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardizing
    ('kmeans', KMeans(n_clusters=7, verbose=False))  #K-means clustering
], verbose=False)

# Select numerical columns from the data
X = df.select_dtypes(np.number)
number_cols = list(X.columns)

# Fit the pipeline to the data
song_cluster_pipeline.fit(X)

# Predict cluster labels for the data
song_cluster_labels = song_cluster_pipeline.predict(X)

# Assign cluster labels to the original dataset
df['cluster_label'] = song_cluster_labels
In [60]:
# Visualizing the Clusters with PCA

from sklearn.decomposition import PCA

# Pipeline for PCA
pca_pipeline = Pipeline([('scaler', StandardScaler()), ('PCA', PCA(n_components=2))])

# FitING the pipeline to the data and transform it to get the PCA embedding
song_embedding = pca_pipeline.fit_transform(X)
# DataFrame to hold the PCA projection
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])

# track title and cluster information to the projection DataFrame
projection['title'] = df['track_name']
projection['cluster'] = df['cluster_label']

# Scatter plot to visualize the PCA projection colored by clusters
fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'title'])
fig.show()

Recommendation¶

Based on the analysis and visualizations, it's evident that songs belonging to similar genres tend to cluster together, indicating a correlation between similarity and the proximity of data points. This observation is intuitive as songs within the same genre typically share similar musical characteristics and are often produced within similar time periods. Leveraging this insight, we can develop a recommendation system by using the data points of songs a user has listened to and suggesting songs that are located near those data points.

Spotipy¶

Spotipy, a Python client for the Spotify Web API, facilitates the retrieval of data and querying of Spotify's extensive catalog of songs.

In [63]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Replace 'secret' with  Spotify client secret
cid = '778a29e66e464a668d6ed5419a05ec3d'
secret = '937d60c66cae42a980dcdbd9fe4968bd'

# Client credentials manager
client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)

# Spotify client
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

The code begins by utilizing the modules to create a dictionary that provides a default value when a key is not present. The find_song function is defined to take a single argument, name, representing the song name to search for. Within this function, a defaultdict named song_data is initialized with list as its default factory function, ensuring that non-existent keys return an empty list. The function then filters a DataFrame df to create song_info, containing rows where the track_name matches the provided name. If no matching song is found (song_info.empty), the function returns None. Otherwise, the song’s track name and artists are appended to their respective lists in song_data. The function also iterates over a predefined list of audio feature keys, appending corresponding values from song_info to song_data. Finally, a DataFrame is created from song_data and returned. An example usage of the function is provided, where it is called with the song name "Come As You Are", and the result is printed. The accompanying markdown report provides a comprehensive overview of the retrieved song details, including the artist and various audio features.

In [65]:
from collections import defaultdict
import pandas as pd

def find_song(artist, track_name):
    # Initializing a defaultdict
    song_data = defaultdict(list)

    # Filter the DataFrame by artist and track name
    song_info = df[(df['track_name'] == track_name) & (df['artists'] == artist)]

    # If no matching song is found, return None
    if song_info.empty:
        return None

    # Adding song information to the dictionary
    song_data['name'].append(song_info.iloc[0]['track_name'])
    song_data['artists'].append(song_info.iloc[0]['artists'])

    # Adding audio features to the dictionary
    for key in ['acousticness', 'danceability', 'energy', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'valence']:
        song_data[key].append(song_info.iloc[0][key])

    # Creating a DataFrame from the dictionary
    return pd.DataFrame(song_data)

# Example usage:
artist_name = "Nirvana"
track_name = "Come As You Are"
song_details = find_song(artist_name, track_name)
print(song_details)
              name  artists  acousticness  danceability    energy  liveness  \
0  Come As You Are  Nirvana     -0.918685     -0.450929  0.718854 -0.625801   

   loudness      mode  speechiness    tempo   valence  
0  0.433696 -1.296546    -0.415584 -0.06467  0.240484  

The similarity is determined based on the mean vector of the input songs and the cosine distance between this vector and the vectors of other songs in the dataset.

  1. Get Song Data: Retrieves data for a specific song from the dataset.

  2. Get Mean Vector: Calculates the mean vector for a list of songs. It uses the SimpleImputer to handle any missing values in the song data. The mean vector represents the average feature values of the input songs.

  3. Flatten Dictionary List: Converts a list of dictionaries into a dictionary of lists.

  4. Recommend Songs: Recommends a specified number of songs based on the input song list. It calculates the mean vector for the input songs, scales the data, computes pairwise cosine distances, and identifies the closest songs in the dataset. It then returns the recommended songs.

In [66]:
from collections import defaultdict
from sklearn.metrics import pairwise_distances
from sklearn.impute import SimpleImputer
import numpy as np


# Function to get song data from the dataset
def get_song_data(song, your_data):
    try:
        # Find the song in the dataset
        song_data = your_data[(your_data['track_name'] == song['name']) & (your_data['artists'] == song['artist'])].iloc[0]
        return song_data
    except IndexError:
        return None

    
# Function to calculate the mean vector of a list of songs
def get_mean_vector(song_list, your_data):
    song_vectors = []
    imputer = SimpleImputer(strategy='mean') # Impute missing values with the mean
    
    for song in song_list:
        song_data = get_song_data(song, your_data)
        if song_data is None:
            print('Warning: {} by {} does not exist in your dataset'.format(song['name'], song['artist']))
            continue
        # Impute missing values and flatten the song vector
        song_data_imputed = imputer.fit_transform(song_data[number_cols].values.reshape(1, -1))
        song_vectors.append(song_data_imputed.flatten())
    # Create a matrix from song vectors and calculate the mean vector    
    song_matrix = np.array(list(song_vectors))
    return np.mean(song_matrix, axis=0)

# Function to flatten a list of dictionaries into a dictionary of lists
def flatten_dict_list(dict_list):
    flattened_dict = defaultdict(list)
    for key in dict_list[0].keys():
        flattened_dict[key] = []
    
    for dictionary in dict_list:
        for key, value in dictionary.items():
            flattened_dict[key].append(value)
            
    return flattened_dict

# Function to recommend songs based on a list of input songs
def recommend_songs(song_list, your_data, n_songs=10):
    metadata_cols = ['track_name', 'artists']
    song_dict = flatten_dict_list(song_list)
    
    # Calculate the mean vector for the input songs
    song_center = get_mean_vector(song_list, your_data)
    scaler = song_cluster_pipeline.steps[0][1]
    # Only include relevant columns for scaling
    scaled_data = scaler.transform(your_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))
    distances = pairwise_distances(scaled_song_center, scaled_data, metric='cosine')
    index = list(np.argsort(distances)[:, :n_songs][0])
    
    rec_songs = your_data.iloc[index]
    rec_songs = rec_songs[~rec_songs['track_name'].isin(song_dict['name']) & ~rec_songs['artists'].isin(song_dict['artist'])]
    return rec_songs[metadata_cols].to_dict(orient='records')

# Example usage:
song_list = [{'name': 'Moonlight', 'artist': 'XXXTENTACION'}, 
             {'name': 'PTSD', 'artist': 'G Herbo'},
             {'name': 'Lucid Dreams', 'artist': 'Juice WRLD'}]  # List of songs

# Get recommended songs
recommended_songs = recommend_songs(song_list, df, n_songs=10)
print("Recommended Songs:")
for index, song in enumerate(recommended_songs, start=1):
    print(f"{index}. {song['track_name']} by {song['artists']}")
Warning: PTSD by G Herbo does not exist in your dataset
Warning: Lucid Dreams by Juice WRLD does not exist in your dataset
Recommended Songs:
1. 03' by Sainte
2. Ganha o Mundo by MC Hariel;Mc Dimenor Dr
3. Teu Herói by Weliton O Gordinho;Thiago Aquino
4. Passive Aggressive by Nate Traveller
5. bingo by Don L
6. Rebola Lentin (feat. Mc Kaio) by Bonde do gato preto;Mc Kaio
7. Sento de Repente by Bonde do gato preto
In [ ]: